{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "is_executing": true
    },
    "ExecuteTime": {
     "start_time": "2024-05-21T20:07:59.314834Z"
    }
   },
   "source": [
    "import datasets\n",
    "import os\n",
    "import click\n",
    "import subprocess\n",
    "\n",
    "from text_dedup.utils import IOArgs\n",
    "from text_dedup.utils import MetaArgs\n",
    "from text_dedup.utils import MinHashArgs\n",
    "from text_dedup.minhash import main as minhash_main\n",
    "from text_dedup.utils.timer import Timer\n",
    "\n",
    "# Scratch locations for the two runs: a `datasets` on-disk copy (used as the\n",
    "# IOArgs path) and a directory holding the parquet input for the Spark job.\n",
    "output_path_ds = \"temp_input_ds\"\n",
    "output_path_spark = \"temp_input_spark\"\n",
    "\n",
    "# Load the train split of the \"gl\" configuration of OSCAR-2201 and save it to disk.\n",
    "dataset = datasets.load_dataset(\n",
    "    \"oscar-corpus/OSCAR-2201\",\n",
    "    \"gl\",\n",
    "    split=\"train\",\n",
    "    trust_remote_code=True,\n",
    ")\n",
    "dataset.save_to_disk(output_path_ds)\n",
    "\n",
    "# Write the same rows as one parquet file inside the Spark input directory.\n",
    "os.makedirs(output_path_spark, exist_ok=True)\n",
    "dataset.to_pandas().to_parquet(f\"{output_path_spark}/data.parquet\")\n",
    "# Settings shared by both runs: they feed MetaArgs/MinHashArgs for the in-process\n",
    "# run and the matching command-line flags of the spark-submit run.\n",
    "column = \"text\"\n",
    "threshold = 0.7\n",
    "num_perm = 250\n",
    "ngram = 5\n",
    "min_length = 0\n",
    "\n",
    "# Only passed to IOArgs (num_proc) for the in-process run.\n",
    "NUM_PROC = 4\n",
    "\n",
    "# Used below as `with t(name):` around each run.\n",
    "t = Timer()\n",
    "\n",
    "meta_args = MetaArgs(batch_size=10000, column=column)\n",
    "io_args = IOArgs(\n",
    "    output=\"./output_minhash\",\n",
    "    path=output_path_ds,\n",
    "    local=True,\n",
    "    cache_dir=\".cache\",\n",
    "    num_proc=NUM_PROC,\n",
    "    clean_cache=True,\n",
    "    debug=True,\n",
    ")\n",
    "\n",
    "# In-process run: invoke the text_dedup MinHash click command directly.\n",
    "with t(\"MinHash\"):\n",
    "    ctx = click.Context(minhash_main)\n",
    "    minhash_args = MinHashArgs(num_perm=num_perm, ngram=ngram, min_length=min_length, threshold=threshold)\n",
    "    io_args.output = minhash_output = \"./output_minhash\"\n",
    "    ctx.invoke(\n",
    "        minhash_main,\n",
    "        io_args=io_args,\n",
    "        meta_args=meta_args,\n",
    "        minhash_args=minhash_args,\n",
    "    )\n",
    "\n",
    "# Spark run: timed in its own top-level block. It was previously nested inside the\n",
    "# \"MinHash\" block, which kept that timing context open for the whole spark-submit run.\n",
    "with t(\"MinHash Spark\"):\n",
    "    spark_output = \"./output_spark\"\n",
    "    # One option per line for readability; the lines are flattened into argv below.\n",
    "    spark_args = f\"\"\"\n",
    "    spark-submit --executor-memory 8g\n",
    "        --driver-memory 8g\n",
    "        --executor-cores 2\n",
    "        --num-executors 2\n",
    "        --packages graphframes:graphframes:0.8.2-spark3.2-s_2.12\n",
    "        --conf spark.executor.extraJavaOptions=-Dlog4j.configuration=../log4j.properties\n",
    "        --conf spark.driver.extraJavaOptions=-Dlog4j.configuration=../log4j.properties\n",
    "        ../text_dedup/minhash_spark.py\n",
    "        --input ./{output_path_spark}\n",
    "        --output {spark_output}\n",
    "        --column {column}\n",
    "        --threshold {threshold}\n",
    "        --min_length {min_length}\n",
    "        --num_perm {num_perm}\n",
    "        --ngram {ngram}\n",
    "        --debug\n",
    "    \"\"\".split(\"\\n\")\n",
    "    # str.split() with no separator strips each line and drops empty pieces,\n",
    "    # so blank lines and indentation in the template never reach argv.\n",
    "    spark_cmd = [token for line in spark_args for token in line.split()]\n",
    "    # check=True raises CalledProcessError on a non-zero exit instead of\n",
    "    # silently recording a time for a spark-submit run that failed.\n",
    "    subprocess.run(spark_cmd, check=True)  # nosec"
   ],
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/chenghao/miniforge3/envs/dedup/lib/python3.10/site-packages/datasets/load.py:1486: FutureWarning: The repository for oscar-corpus/OSCAR-2201 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/oscar-corpus/OSCAR-2201\n",
      "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
      "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/88803 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "3e3691fdd6894b45a204437b0fbcc7d4"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Map (num_proc=4):   0%|          | 0/88803 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "b00b8b811eae4ae4824199e7089c85be"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Filter (num_proc=4):   0%|          | 0/88803 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "b2c91208b91e43428608494cc9850322"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Fingerprinting... (num_proc=4):   0%|          | 0/88803 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "d2f3fd1160e34411a684d57f036360e4"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Iterating MinHashes...: 100%|██████████| 9/9 [00:02<00:00,  3.08it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "\u001B[2;36m[05/21/24 21:08:27]\u001B[0m\u001B[2;36m \u001B[0m\u001B[34mINFO    \u001B[0m Number of clusters: \u001B[1;36m25\u001B[0m                                                  \u001B]8;id=631;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py\u001B\\\u001B[2mminhash.py\u001B[0m\u001B]8;;\u001B\\\u001B[2m:\u001B[0m\u001B]8;id=429909;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py#265\u001B\\\u001B[2m265\u001B[0m\u001B]8;;\u001B\\\n"
      ],
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[05/21/24 21:08:27] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Number of clusters: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">25</span>                                                  <a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">minhash.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py#265\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">265</span></a>\n",
       "</pre>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Clustering...: 100%|██████████| 25/25 [00:01<00:00, 23.55it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "\u001B[2;36m[05/21/24 21:08:28]\u001B[0m\u001B[2;36m \u001B[0m\u001B[34mINFO    \u001B[0m Number of edges: \u001B[1;36m58489\u001B[0m                                                  \u001B]8;id=788440;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py\u001B\\\u001B[2mminhash.py\u001B[0m\u001B]8;;\u001B\\\u001B[2m:\u001B[0m\u001B]8;id=32690;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py#275\u001B\\\u001B[2m275\u001B[0m\u001B]8;;\u001B\\\n"
      ],
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[05/21/24 21:08:28] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Number of edges: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">58489</span>                                                  <a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">minhash.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py#275\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">275</span></a>\n",
       "</pre>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Finding clusters... (num_proc=4):   0%|          | 0/88803 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "075edde37f454b54b354f2a90e9a4e22"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Filtering clusters... (num_proc=4):   0%|          | 0/88803 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "65586619f30146c1b559d38b498a7c21"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/44099 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "4e764c5b0b0249f89b3c6f201e7d99dc"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001B[2;36m[05/21/24 21:08:31]\u001B[0m\u001B[2;36m \u001B[0m\u001B[34mINFO    \u001B[0m Loading                         : \u001B[1;36m2.\u001B[0m93s                                    \u001B]8;id=998524;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py\u001B\\\u001B[2mtimer.py\u001B[0m\u001B]8;;\u001B\\\u001B[2m:\u001B[0m\u001B]8;id=794937;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py#65\u001B\\\u001B[2m65\u001B[0m\u001B]8;;\u001B\\\n"
      ],
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[05/21/24 21:08:31] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loading                         : <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2.</span>93s                                    <a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">timer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py#65\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">65</span></a>\n",
       "</pre>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001B[2;36m                   \u001B[0m\u001B[2;36m \u001B[0m\u001B[34mINFO    \u001B[0m MinHashing                      : \u001B[1;36m18.\u001B[0m30s                                   \u001B]8;id=957533;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py\u001B\\\u001B[2mtimer.py\u001B[0m\u001B]8;;\u001B\\\u001B[2m:\u001B[0m\u001B]8;id=482903;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py#65\u001B\\\u001B[2m65\u001B[0m\u001B]8;;\u001B\\\n"
      ],
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> MinHashing                      : <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">18.</span>30s                                   <a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">timer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py#65\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">65</span></a>\n",
       "</pre>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001B[2;36m                   \u001B[0m\u001B[2;36m \u001B[0m\u001B[34mINFO    \u001B[0m Clustering                      : \u001B[1;36m4.\u001B[0m04s                                    \u001B]8;id=190383;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py\u001B\\\u001B[2mtimer.py\u001B[0m\u001B]8;;\u001B\\\u001B[2m:\u001B[0m\u001B]8;id=702789;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py#65\u001B\\\u001B[2m65\u001B[0m\u001B]8;;\u001B\\\n"
      ],
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Clustering                      : <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">4.</span>04s                                    <a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">timer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py#65\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">65</span></a>\n",
       "</pre>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001B[2;36m                   \u001B[0m\u001B[2;36m \u001B[0m\u001B[34mINFO    \u001B[0m Filtering                       : \u001B[1;36m2.\u001B[0m21s                                    \u001B]8;id=330908;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py\u001B\\\u001B[2mtimer.py\u001B[0m\u001B]8;;\u001B\\\u001B[2m:\u001B[0m\u001B]8;id=650274;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py#65\u001B\\\u001B[2m65\u001B[0m\u001B]8;;\u001B\\\n"
      ],
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Filtering                       : <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2.</span>21s                                    <a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">timer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py#65\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">65</span></a>\n",
       "</pre>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001B[2;36m                   \u001B[0m\u001B[2;36m \u001B[0m\u001B[34mINFO    \u001B[0m Saving                          : \u001B[1;36m0.\u001B[0m28s                                    \u001B]8;id=711742;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py\u001B\\\u001B[2mtimer.py\u001B[0m\u001B]8;;\u001B\\\u001B[2m:\u001B[0m\u001B]8;id=863151;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py#65\u001B\\\u001B[2m65\u001B[0m\u001B]8;;\u001B\\\n"
      ],
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Saving                          : <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.</span>28s                                    <a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">timer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py#65\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">65</span></a>\n",
       "</pre>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001B[2;36m                   \u001B[0m\u001B[2;36m \u001B[0m\u001B[34mINFO    \u001B[0m Cleaning                        : \u001B[1;36m0.\u001B[0m01s                                    \u001B]8;id=188492;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py\u001B\\\u001B[2mtimer.py\u001B[0m\u001B]8;;\u001B\\\u001B[2m:\u001B[0m\u001B]8;id=289814;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py#65\u001B\\\u001B[2m65\u001B[0m\u001B]8;;\u001B\\\n"
      ],
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Cleaning                        : <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.</span>01s                                    <a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">timer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py#65\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">65</span></a>\n",
       "</pre>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001B[2;36m                   \u001B[0m\u001B[2;36m \u001B[0m\u001B[34mINFO    \u001B[0m Total                           : \u001B[1;36m27.\u001B[0m77s                                   \u001B]8;id=96915;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py\u001B\\\u001B[2mtimer.py\u001B[0m\u001B]8;;\u001B\\\u001B[2m:\u001B[0m\u001B]8;id=568938;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py#65\u001B\\\u001B[2m65\u001B[0m\u001B]8;;\u001B\\\n"
      ],
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Total                           : <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">27.</span>77s                                   <a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">timer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/utils/timer.py#65\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">65</span></a>\n",
       "</pre>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001B[2;36m                   \u001B[0m\u001B[2;36m \u001B[0m\u001B[34mINFO    \u001B[0m Before                          : \u001B[1;36m88803\u001B[0m                                 \u001B]8;id=290224;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py\u001B\\\u001B[2mminhash.py\u001B[0m\u001B]8;;\u001B\\\u001B[2m:\u001B[0m\u001B]8;id=715242;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py#308\u001B\\\u001B[2m308\u001B[0m\u001B]8;;\u001B\\\n"
      ],
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Before                          : <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">88803</span>                                 <a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">minhash.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py#308\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">308</span></a>\n",
       "</pre>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001B[2;36m                   \u001B[0m\u001B[2;36m \u001B[0m\u001B[34mINFO    \u001B[0m After                           : \u001B[1;36m44099\u001B[0m                                 \u001B]8;id=115319;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py\u001B\\\u001B[2mminhash.py\u001B[0m\u001B]8;;\u001B\\\u001B[2m:\u001B[0m\u001B]8;id=761389;file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py#309\u001B\\\u001B[2m309\u001B[0m\u001B]8;;\u001B\\\n"
      ],
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> After                           : <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">44099</span>                                 <a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">minhash.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///Users/chenghao/Downloads/Workspace/text-dedup/text_dedup/minhash.py#309\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">309</span></a>\n",
       "</pre>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ":: loading settings :: url = jar:file:/Users/chenghao/miniforge3/envs/dedup/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Ivy Default Cache set to: /Users/chenghao/.ivy2/cache\n",
      "The jars for the packages stored in: /Users/chenghao/.ivy2/jars\n",
      "graphframes#graphframes added as a dependency\n",
      ":: resolving dependencies :: org.apache.spark#spark-submit-parent-481f3b0b-867d-4704-a87e-be5699d67bbd;1.0\n",
      "\tconfs: [default]\n",
      "\tfound graphframes#graphframes;0.8.2-spark3.2-s_2.12 in spark-packages\n",
      "\tfound org.slf4j#slf4j-api;1.7.16 in central\n",
      ":: resolution report :: resolve 94ms :: artifacts dl 3ms\n",
      "\t:: modules in use:\n",
      "\tgraphframes#graphframes;0.8.2-spark3.2-s_2.12 from spark-packages in [default]\n",
      "\torg.slf4j#slf4j-api;1.7.16 from central in [default]\n",
      "\t---------------------------------------------------------------------\n",
      "\t|                  |            modules            ||   artifacts   |\n",
      "\t|       conf       | number| search|dwnlded|evicted|| number|dwnlded|\n",
      "\t---------------------------------------------------------------------\n",
      "\t|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |\n",
      "\t---------------------------------------------------------------------\n",
      ":: retrieving :: org.apache.spark#spark-submit-parent-481f3b0b-867d-4704-a87e-be5699d67bbd\n",
      "\tconfs: [default]\n",
      "\t0 artifacts copied, 2 already retrieved (0kB/3ms)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DEBUG __main__ - ------------------------------------------------------------------------------------------------------------------------\n",
      "DEBUG __main__ - Using B=25, R=10\n",
      "DEBUG __main__ - Loaded documents: 88803\n",
      "DEBUG __main__ - args.input='./temp_input_spark'\n",
      "DEBUG __main__ - args.output='./output_spark'\n",
      "DEBUG __main__ - args.threshold=0.7\n",
      "DEBUG __main__ - args.ngram_size=5\n",
      "DEBUG __main__ - args.min_length=0\n",
      "DEBUG __main__ - args.num_perm=250\n",
      "DEBUG __main__ - args.column='text'\n",
      "DEBUG __main__ - id                                                              : bigint\n",
      "DEBUG __main__ - text                                                            : string\n",
      "DEBUG __main__ - meta                                                            : struct<annotations:array<string>,identification:struct<label:string,prob:double>,line_identifications:array<struct<label:string,prob:double>>,warc_headers:struct<content-length:bigint,content-type:string,warc-block-digest:string,warc-date:string,warc-identified-content-language:string,warc-record-id:string,warc-refers-to:string,warc-target-uri:string,warc-type:string>>\n",
      "DEBUG __main__ - __id__                                                          : bigint\n",
      "DEBUG __main__ - ------------------------------------------------------------------------------------------------------------------------\n"
     ]
    }
   ],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "",
   "id": "40cbd8b098762f85"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
